import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from scipy.optimize import minimize
import seaborn as sns

# Input CSV files (judging by the file names, one per policy or crisis event).
file_paths = [
    r"C:/Users/HP/Desktop/quantitative_easing_financial_data.csv",
    r"C:/Users/HP/Desktop/processed_brexit_features.csv",
    r"C:/Users/HP/Desktop/processed_covid_19_crisis_features.csv",
    r"C:/Users/HP/Desktop/processed_dodd_frank_act_features.csv",
    r"C:/Users/HP/Desktop/processed_eurozone_debt_crisis_features.csv",
    r"C:/Users/HP/Desktop/processed_global_financial_crisis_features.csv",
    r"C:/Users/HP/Desktop/processed_quantitative_easing_features.csv"
]

# Read each file in the order listed, then stack all rows into a single frame
# with a fresh 0..n-1 row index.
dfs = list(map(pd.read_csv, file_paths))
data = pd.concat(dfs, ignore_index=True)

# Use the first candidate name that is actually present among the columns.
date_column_candidates = ['Date', 'date', 'Timestamp', 'timestamp']
date_column = next(
    (name for name in date_column_candidates if name in data.columns),
    None,
)

if date_column is None:
    raise ValueError(f"No date column found. Please ensure one of {date_column_candidates} exists in the dataset.")

# Parse the dates (entries that cannot be parsed become NaT), discard the rows
# whose date is missing, then index the frame by date in ascending order.
data[date_column] = pd.to_datetime(data[date_column], errors='coerce')
data = (
    data.dropna(subset=[date_column])
    .set_index(date_column)
    .sort_index()
)

# Handle missing values by carrying the last observed value forward in time.
# DataFrame.ffill() is the supported spelling: fillna(method='ffill') has been
# deprecated since pandas 2.1 and emits a FutureWarning there.
# NOTE: rows that precede a column's first valid observation have nothing to
# carry forward, so they are still NaN after this step.
data.ffill(inplace=True)

# Set the target variable
target_variable = 'Stock_Index'  # Replace with your actual target variable

# Fail early with a clear message instead of a KeyError further down.
if target_variable not in data.columns:
    raise ValueError(f"Target variable '{target_variable}' not found in dataset.")

# Visualization: line chart of the target series over its index.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(data[target_variable], label=f'{target_variable} Time Series', color='blue')
ax.set_title(f'{target_variable} Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.legend()
plt.show()

# Seasonal decomposition (additive model, 24 observations per cycle).
# seasonal_decompose() rejects input that contains missing values, and the
# forward fill above cannot fill anything that precedes the first valid
# observation, so drop the remaining NaNs here -- the same way the ADF test,
# the ACF/PACF plots and the ARIMA fit below already do.
result = seasonal_decompose(data[target_variable].dropna(), model='additive', period=24)  # Adjust period if needed
result.plot()
plt.show()

# Augmented Dickey-Fuller stationarity test on the non-missing observations.
adf_result = adfuller(data[target_variable].dropna())

# Name the fields of the result tuple that are reported below.
adf_statistic, adf_p_value = adf_result[:2]
adf_critical_values = adf_result[4]

print("ADF Test Statistic:", adf_statistic)
print("p-value:", adf_p_value)
print("Critical Values:", adf_critical_values)

# Visualization: ACF and PACF plots for model diagnostics
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Both correlograms use the same NaN-free series and 40 lags, and are drawn
# side by side on one 12x6 figure.
correlogram_series = data[target_variable].dropna()
fig, (acf_ax, pacf_ax) = plt.subplots(1, 2, figsize=(12, 6))

plot_acf(correlogram_series, ax=acf_ax, lags=40)
acf_ax.set_title('Autocorrelation Function (ACF)')

plot_pacf(correlogram_series, ax=pacf_ax, lags=40)
pacf_ax.set_title('Partial Autocorrelation Function (PACF)')

fig.tight_layout()
plt.show()

# Fit an ARIMA model of the configured (p, d, q) order to the target series.
arima_order = (1, 1, 1)  # Replace with your desired order
training_series = data[target_variable].dropna()  # only non-missing values are modelled
arima_model = ARIMA(training_series, order=arima_order)
arima_fit = arima_model.fit()

# Print the fitted model's summary table.
fit_summary = arima_fit.summary()
print(fit_summary)

# Visualization: ARIMA residuals -- histogram on the left, time plot on the right.
residuals = arima_fit.resid
fig, (distribution_ax, timeline_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: histogram of the residuals with a kernel density overlay.
sns.histplot(residuals, kde=True, bins=30, color='purple', ax=distribution_ax)
distribution_ax.set_title('Residual Distribution')
distribution_ax.set_xlabel('Residuals')
distribution_ax.set_ylabel('Frequency')

# Right panel: the residuals plotted in index order.
timeline_ax.plot(residuals, color='orange')
timeline_ax.set_title('Residuals Over Time')
timeline_ax.set_xlabel('Time')
timeline_ax.set_ylabel('Residuals')

fig.tight_layout()
plt.show()

# Forecast Visualization
forecast_steps = 50  # Set number of steps to forecast
forecast = arima_fit.forecast(steps=forecast_steps)
observed_tail = data[target_variable][-200:]  # most recent 200 observations

plt.figure(figsize=(12, 6))
if isinstance(getattr(forecast, 'index', None), pd.DatetimeIndex):
    # The forecast is labelled with dates, so it can share the observed
    # series' time axis directly.
    plt.plot(observed_tail, label='Observed', color='blue')
    plt.plot(forecast, label='Forecast', color='red')
else:
    # When the date index has no usable frequency (e.g. irregular or
    # duplicated timestamps), statsmodels labels the forecast with integers
    # instead of dates. Drawing those on a date axis would put the forecast
    # nowhere near the observed data, so plot both on a common positional
    # axis: negative steps are history, non-negative steps are the forecast.
    forecast_values = np.asarray(forecast)
    plt.plot(np.arange(-len(observed_tail), 0), observed_tail.to_numpy(),
             label='Observed', color='blue')
    plt.plot(np.arange(len(forecast_values)), forecast_values,
             label='Forecast', color='red')
    plt.xlabel('Steps relative to last observation')
plt.title(f'{target_variable} Forecast (ARIMA)')
plt.legend()
plt.show()

# Maximum Likelihood Estimation Example
def log_likelihood(params):
    """Return the negative normal log-likelihood of the target series.

    Despite the name, the value returned is the *negated* log-likelihood,
    which is the quantity a minimizer needs as its objective.

    The observations are read from the module-level ``data`` frame, column
    ``target_variable``.

    Args:
        params: Two-element sequence ``(mu, sigma)`` holding the mean and
            the standard deviation of the normal distribution.

    Returns:
        Half the sum, over the observations, of
        ``log(2 * pi * sigma**2) + (value - mu)**2 / sigma**2``.
    """
    mu, sigma = params
    variance = sigma ** 2
    deviations = data[target_variable] - mu
    # Per-observation contribution, still missing the common factor of 1/2.
    per_observation = np.log(2 * np.pi * variance) + (deviations ** 2) / variance
    return 0.5 * np.sum(per_observation)

# Start the optimizer at the sample mean and (population) standard deviation.
# Fixed guesses of 0 and 1 combined with bounds of +/-10 on mu and <=10 on
# sigma would pin the estimate to a bound for any series on a larger scale.
target_values = data[target_variable].dropna()
sample_mean = float(target_values.mean())
sample_std = float(target_values.std(ddof=0))

min_sigma = 1e-5  # sigma must stay strictly positive for the log and the division
# Fall back to the neutral guesses when the moments are unusable (no data, or a
# constant series whose standard deviation is zero).
initial_mu = sample_mean if np.isfinite(sample_mean) else 0.0
initial_sigma = sample_std if np.isfinite(sample_std) and sample_std > min_sigma else 1.0

initial_params = [initial_mu, initial_sigma]  # Initial guesses for mu and sigma
# mu is unbounded; sigma is bounded only from below.
mle_result = minimize(log_likelihood, initial_params, bounds=[(None, None), (min_sigma, None)])
print("MLE Results:", mle_result)

# Visualization: fitted normal density drawn over the data histogram.
mu_opt, sigma_opt = mle_result.x

# Evaluate the fitted density at 1000 evenly spaced points spanning the
# observed range of the target variable.
x = np.linspace(data[target_variable].min(), data[target_variable].max(), 1000)
density_scale = 1 / (np.sqrt(2 * np.pi * sigma_opt**2))
standardized = (x - mu_opt) / sigma_opt
y = density_scale * np.exp(-0.5 * standardized ** 2)

fig, ax = plt.subplots(figsize=(12, 6))
# stat='density' normalizes the histogram so it is comparable with the curve.
sns.histplot(data[target_variable], kde=False, bins=30, label='Data Distribution', color='blue', stat='density', ax=ax)
ax.plot(x, y, label=f'Fitted Distribution\n(mu={mu_opt:.2f}, sigma={sigma_opt:.2f})', color='red')
ax.set_title('MLE Fitted Normal Distribution')
ax.legend()
plt.show()

# Write the combined, date-indexed frame to disk for later use.
output_path = r"C:/Users/HP/Desktop/processed_combined_data.csv"
data.to_csv(output_path)
